# install.packages("tidyverse")
# install.packages("readxl")
# install.packages("psych")
# install.packages("gridExtra")

library(psych)
library(scales)
## 
## Attaching package: 'scales'
## The following objects are masked from 'package:psych':
## 
##     alpha, rescale
library(gridExtra)
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.0 ──
## ✓ ggplot2 3.3.3     ✓ purrr   0.3.4
## ✓ tibble  3.0.5     ✓ dplyr   1.0.3
## ✓ tidyr   1.1.2     ✓ stringr 1.4.0
## ✓ readr   1.4.0     ✓ forcats 0.5.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x ggplot2::%+%()      masks psych::%+%()
## x ggplot2::alpha()    masks scales::alpha(), psych::alpha()
## x readr::col_factor() masks scales::col_factor()
## x dplyr::combine()    masks gridExtra::combine()
## x purrr::discard()    masks scales::discard()
## x dplyr::filter()     masks stats::filter()
## x dplyr::lag()        masks stats::lag()
library(readxl)
library(knitr)

Read data

# Locations of the two Excel workbooks, relative to the working directory.
CallCentreUrl <- "data/callcentre_06.xlsx"
HRMUrl <- "data/HRM_06.xlsx"

# Read each workbook into a tibble (one for the HR records, one for the calls).
HRMData <- read_excel(path = HRMUrl)
CallCentreData <- read_excel(path = CallCentreUrl)

CallCentre data description

# Print the structure (column names, types, first values) of the call data.
str(CallCentreData)
## tibble [150,937 × 7] (S3: tbl_df/tbl/data.frame)
##  $ time                 : num [1:150937] 2 3 3 3 3 3 3 3 3 3 ...
##  $ length               : num [1:150937] 227 288 189 206 261 166 231 209 200 255 ...
##  $ forwarded            : chr [1:150937] "NA" "NA" "NA" "NA" ...
##  $ customer_satisfaction: num [1:150937] 10 7 7 9 NA 10 NA NA 8 9 ...
##  $ waiting              : num [1:150937] 24 0 54 282 151 10 26 0 212 56 ...
##  $ problem              : chr [1:150937] "delivery" "delivery" "delivery" "delivery" ...
##  $ agent                : num [1:150937] 58 19 53 95 113 16 61 62 27 95 ...

CallCentre data summary

# Per-column summary statistics of the call data.
summary(CallCentreData)
##       time          length        forwarded         customer_satisfaction
##  Min.   : 2.0   Min.   :   5.0   Length:150937      Min.   : 1.0         
##  1st Qu.: 9.0   1st Qu.: 198.0   Class :character   1st Qu.: 7.0         
##  Median :12.0   Median : 246.0   Mode  :character   Median : 9.0         
##  Mean   :13.5   Mean   : 311.1                      Mean   : 8.3         
##  3rd Qu.:18.0   3rd Qu.: 399.0                      3rd Qu.:10.0         
##  Max.   :24.0   Max.   :1332.0                      Max.   :10.0         
##                                                     NA's   :42461        
##     waiting         problem              agent       
##  Min.   :  0.00   Length:150937      Min.   :  1.00  
##  1st Qu.:  0.00   Class :character   1st Qu.: 31.00  
##  Median : 29.00   Mode  :character   Median : 58.00  
##  Mean   : 52.93                      Mean   : 59.36  
##  3rd Qu.: 90.00                      3rd Qu.: 88.00  
##  Max.   :561.00                      Max.   :122.00  
## 

HRM data description

# Print the structure (column names, types, first values) of the HR data.
str(HRMData)
## tibble [120 × 6] (S3: tbl_df/tbl/data.frame)
##  $ gender       : chr [1:120] "male" "female" "male" "male" ...
##  $ tenure       : num [1:120] 10 51 50 21 5 61 70 7 45 50 ...
##  $ age          : num [1:120] 19 22 23 23 24 24 24 24 25 25 ...
##  $ qualification: chr [1:120] "some college" "apprenticeship" "apprenticeship" "apprenticeship" ...
##  $ ethnicity    : chr [1:120] "Black" "British" "Black" "Black" ...
##  $ agent        : chr [1:120] "109" "65" "67" "92" ...

HRM data summary

# Per-column summary statistics of the HR data.
summary(HRMData)
##     gender              tenure            age        qualification     
##  Length:120         Min.   :  0.00   Min.   :19.00   Length:120        
##  Class :character   1st Qu.: 26.25   1st Qu.:27.00   Class :character  
##  Mode  :character   Median : 54.50   Median :29.00   Mode  :character  
##                     Mean   : 65.47   Mean   :28.88                     
##                     3rd Qu.: 91.50   3rd Qu.:31.00                     
##                     Max.   :229.00   Max.   :38.00                     
##   ethnicity            agent          
##  Length:120         Length:120        
##  Class :character   Class :character  
##  Mode  :character   Mode  :character  
##                                       
##                                       
## 

Number of calls by time

# Calls per value of `time`: raw counts, then counts with their share of all
# calls, then a bar chart with the count printed above each bar.
CallCentreData %>%
  count(time)
options(scipen = 3)
CallCentreData %>%
  count(time) %>%
  mutate(percentage = n / sum(n) * 100)
CallCentreData %>%
  count(time) %>%
  ggplot(aes(x = time, y = n)) +
  geom_col(
    position = position_dodge(),
    fill = "#4126de", color = "#e9ecef", alpha = 0.9
  ) +
  geom_text(
    aes(label = n),
    vjust = -0.5,
    position = position_dodge(0.9),
    size = 2.5
  ) +
  labs(
    title = "Number Of Call By Time",
    x = "Time of the day (in hours)",
    y = "Number of call"
  ) +
  theme_minimal()

Average duration of the call by time

# Mean call length per value of `time`: table first, then a bar chart with a
# red reference line at the overall mean call length.
CallCentreData %>%
  group_by(time) %>%
  summarise(mean = mean(length, na.rm = TRUE))
CallCentreData %>%
  group_by(time) %>%
  summarise(mean = round(mean(length, na.rm = TRUE))) %>%
  ggplot(aes(x = time, y = mean)) +
  geom_bar(
    stat = "identity", position = position_dodge(),
    fill = "#69b3a2", color = "#e9ecef", alpha = 0.9
  ) +
  ylab("Average length of time") +
  xlab("Time of the day (in hours)") +
  geom_text(
    aes(label = mean),
    vjust = -0.5,
    position = position_dodge(0.9),
    size = 2.5
  ) +
  # Overall mean uses na.rm = TRUE so it matches the per-hour means above.
  geom_hline(
    yintercept = mean(CallCentreData$length, na.rm = TRUE),
    color = "#fc0303"
  ) +
  # annotate() draws the label exactly once; geom_text() with constant
  # aesthetics would redraw it for every row of the summarised data.
  annotate(
    "text",
    x = 1,
    y = mean(CallCentreData$length, na.rm = TRUE),
    label = paste(
      "Average of all time:",
      round(mean(CallCentreData$length, na.rm = TRUE))
    ),
    hjust = 0, vjust = -1, size = 3.5
  ) +
  ggtitle("Average Length Of Time Of The Call By Time") +
  theme_minimal()

% of problem

# Share of calls per problem type, most frequent first.
CallCentreData %>%
  count(problem) %>%
  mutate(percentage = n / sum(n) * 100) %>%
  arrange(desc(n))

# Same counts with a rounded percentage and a ready-made slice label.
problemPie <- CallCentreData %>%
  count(problem) %>%
  mutate(
    percentage = round(n / sum(n) * 100, 1),
    labels = paste0(problem, " (", percentage, "%)")
  )

library(RColorBrewer)
myPalette <- brewer.pal(5, "Set2")
pie(
  problemPie$n,
  labels = problemPie$labels,
  border = "white",
  col = myPalette,
  main = "Percentage Of Problem"
)

% of forward

# Share of forwarded vs. not forwarded calls. The literal string "NA" in
# `forwarded` is relabelled as "not forwarded".
CallCentreData %>%
  mutate(forwarded = recode(forwarded, "forwarded" = "forwarded", "NA" = "not forwarded")) %>%
  count(forwarded) %>%
  mutate(percentage = n / sum(n) * 100)

# Pie chart of the same shares.
CallCentreData %>%
  mutate(forwarded = recode(forwarded, "forwarded" = "forwarded", "NA" = "not forwarded")) %>%
  count(forwarded) %>%
  mutate(percentage = round(n / sum(n) * 100, 1)) %>%
  ggplot(aes(x = "", y = percentage, fill = forwarded)) +
  geom_bar(stat = "identity", width = 1, color = "white") +
  coord_polar("y", start = 0) +
  theme_void(base_size = 13) +
  theme(legend.position = "none") +
  ggtitle("Percentage Forwarded") +
  # position_stack() centres each label inside its own slice, so the labels no
  # longer depend on a hand-computed cumulative position matching the order in
  # which the slices are stacked.
  geom_text(
    aes(
      group = forwarded,
      label = paste0(forwarded, "\n", n, " (", percentage, "%)")
    ),
    position = position_stack(vjust = 0.5),
    color = "white", size = 4
  ) +
  scale_fill_brewer(palette = "Set1")

forward by time

# Forwarded / not forwarded call counts per value of `time`, one column per
# status; combinations with no calls are shown as 0.
CallCentreData %>%
  mutate(forwarded = recode(forwarded, "forwarded" = "forwarded", "NA" = "not forwarded")) %>%
  count(time, forwarded) %>%
  pivot_wider(
    names_from = forwarded,
    values_from = n,
    values_fill = list(n = 0L),
    names_sort = TRUE
  )

# Stacked bar chart of the same counts.
CallCentreData %>%
  mutate(forwarded = recode(forwarded, "forwarded" = "forwarded", "NA" = "not forwarded")) %>%
  count(time, forwarded) %>%
  ggplot(aes(x = time, y = n, fill = forwarded)) +
  geom_bar(stat = "identity") +
  theme(legend.title = element_blank()) +
  ggtitle("Forwarded By Time") +
  # The y axis is a call count, not a duration.
  ylab("Number of call") +
  xlab("Time of the day (in hours)")

Overall satisfaction

# Five-number summary, mean and NA count of the satisfaction score.
summary(CallCentreData$customer_satisfaction)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##     1.0     7.0     9.0     8.3    10.0    10.0   42461
# Horizontal, notched box plot of the satisfaction score.
boxplot(
  x = CallCentreData$customer_satisfaction,
  horizontal = TRUE,
  notch = TRUE,
  col = "orange",
  border = "brown",
  main = "Satisfaction With The Service",
  xlab = "Customer satisfaction score"
)

# Share of rated calls per satisfaction band:
# below 5 is "Low", 5 to below 8 is "Middle", 8 and above is "High".
CallCentreData %>%
  filter(!is.na(customer_satisfaction)) %>%
  mutate(
    satisfaction_group = case_when(
      customer_satisfaction < 5 ~ "Low",
      customer_satisfaction < 8 ~ "Middle",
      TRUE ~ "High"
    )
  ) %>%
  count(satisfaction_group) %>%
  mutate(percentage = n / sum(n) * 100)

# Number of calls per score; the NA row shows how many calls were not rated.
CallCentreData %>%
  count(customer_satisfaction)

# Bar chart of calls per score. Unrated calls are dropped up front so ggplot2
# does not have to discard an NA bar.
CallCentreData %>%
  filter(!is.na(customer_satisfaction)) %>%
  count(customer_satisfaction) %>%
  ggplot(aes(x = customer_satisfaction, y = n)) +
  geom_bar(stat = "identity", position = position_dodge(), alpha = 0.9) +
  ylab("Number of call") +
  xlab("Customer satisfaction score") +
  geom_text(
    aes(label = n),
    vjust = -0.5,
    position = position_dodge(0.9),
    size = 2.5
  ) +
  ggtitle("Satisfaction With The Service") +
  theme_minimal()
## Warning: Removed 1 rows containing missing values (geom_bar).
## Warning: Removed 1 rows containing missing values (geom_text).

satisfaction by duration

# Mean call length and mean waiting time per satisfaction score
# (unrated calls appear as their own NA group).
CallCentreData %>%
  group_by(customer_satisfaction) %>%
  summarise(
    mean_length = mean(length),
    mean_waiting = mean(waiting)
  )

# Call length against satisfaction score with a linear fit.
ggplot(CallCentreData, aes(x = customer_satisfaction, y = length)) +
  # A fixed colour must be set outside aes(); inside aes() the string "blue"
  # is treated as a data value and the points are not drawn in blue.
  geom_point(color = "blue", alpha = 0.8) +
  geom_smooth(formula = y ~ x, method = "lm") +
  ggtitle("Satisfaction By Duration Of The Call") +
  xlab("Customer satisfaction score") +
  ylab("Duration of the call") +
  theme_bw(base_size = 12)
## Warning: Removed 42461 rows containing non-finite values (stat_smooth).
## Warning: Removed 42461 rows containing missing values (geom_point).

# One box of call lengths per satisfaction score (score treated as a factor).
ggplot(
  CallCentreData,
  aes(
    x = length,
    y = as_factor(customer_satisfaction),
    fill = as_factor(customer_satisfaction)
  )
) +
  geom_boxplot() +
  labs(
    title = "Satisfaction By Duration Of The Call",
    x = "Duration of the call",
    y = "Customer satisfaction score"
  ) +
  theme(legend.position = "none")

satisfaction by waiting

# Waiting time against satisfaction score with a linear fit.
ggplot(CallCentreData, aes(x = customer_satisfaction, y = waiting)) +
  # A fixed colour must be set outside aes(); inside aes() the string "blue"
  # is treated as a data value and the points are not drawn in blue.
  geom_point(color = "blue", alpha = 0.8) +
  geom_smooth(formula = y ~ x, method = "lm") +
  ggtitle("Satisfaction By Waiting Time") +
  xlab("Customer satisfaction score") +
  ylab("Waiting time") +
  theme_bw(base_size = 12)
## Warning: Removed 42461 rows containing non-finite values (stat_smooth).
## Warning: Removed 42461 rows containing missing values (geom_point).

# One box of waiting times per satisfaction score (score treated as a factor).
ggplot(
  CallCentreData,
  aes(
    x = waiting,
    y = as_factor(customer_satisfaction),
    fill = as_factor(customer_satisfaction)
  )
) +
  geom_boxplot() +
  labs(
    title = "Satisfaction By Waiting Time",
    x = "Waiting time",
    y = "Customer satisfaction score"
  ) +
  theme(legend.position = "none")

satisfaction by problem

# Distribution of the satisfaction score per problem type, ignoring unrated
# calls; `Missing` counts the unrated calls in each group.
CallCentreData %>%
  group_by(problem) %>%
  summarise(
    Min = min(customer_satisfaction, na.rm = TRUE),
    "1st Quartile" = quantile(customer_satisfaction, probs = 0.25, na.rm = TRUE),
    Median = median(customer_satisfaction, na.rm = TRUE),
    Mean = mean(customer_satisfaction, na.rm = TRUE),
    "3rd Quartile" = quantile(customer_satisfaction, probs = 0.75, na.rm = TRUE),
    Max = max(customer_satisfaction, na.rm = TRUE),
    Missing = sum(is.na(customer_satisfaction))
  )

# Box plot per problem type; the groups are identified by the fill legend, so
# the y-axis text and ticks are hidden.
ggplot(CallCentreData, aes(x = customer_satisfaction, y = problem, fill = problem)) +
  geom_boxplot() +
  ggtitle("Satisfaction By Problem") +
  ylab("") +
  xlab("Customer satisfaction score") +
  theme(
    axis.text.y = element_blank(),
    axis.ticks = element_blank()
  )
## Warning: Removed 42461 rows containing non-finite values (stat_boxplot).

waiting time

# Five-number summary and mean of the waiting time.
summary(CallCentreData$waiting)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    0.00    0.00   29.00   52.93   90.00  561.00
# Horizontal, notched box plot of the waiting time.
boxplot(
  x = CallCentreData$waiting,
  horizontal = TRUE,
  notch = TRUE,
  col = "orange",
  border = "brown",
  main = "Waiting time",
  xlab = "Waiting time in second"
)

# Share of calls per waiting-time band: under 60, 60 to 120, over 120.
CallCentreData %>%
  filter(!is.na(waiting)) %>%
  mutate(
    waiting_group = case_when(
      waiting < 60 ~ "< 60s",
      waiting <= 120 ~ "60s - 120s",
      TRUE ~ "> 120s"
    )
  ) %>%
  count(waiting_group) %>%
  mutate(percentage = n / sum(n) * 100)

agent performance by number of calls

# Calls handled per agent, busiest first.
CallCentreData %>%
  count(agent) %>%
  arrange(desc(n))

# Lollipop chart of the ten agents with the most calls.
Top10byCall <- CallCentreData %>%
  count(agent) %>%
  mutate(agent = paste("Agent", agent)) %>%
  arrange(desc(n)) %>%
  head(10) %>%
  mutate(agent = fct_reorder(agent, n)) %>%
  ggplot(aes(x = n, y = agent)) +
  geom_segment(aes(x = 0, xend = n, y = agent, yend = agent), color = "blue") +
  geom_point(color = "green", size = 4) +
  theme_light() +
  theme(
    panel.grid.major.x = element_blank(),
    panel.border = element_blank(),
    axis.ticks.x = element_blank()
  ) +
  labs(title = "Top 10", x = "", y = "") +
  xlim(0, 2500)

# Lollipop chart of the ten agents with the fewest calls, on the same x range.
Tail10byCall <- CallCentreData %>%
  count(agent) %>%
  mutate(agent = paste("Agent", agent)) %>%
  arrange(desc(n)) %>%
  tail(10) %>%
  mutate(agent = fct_reorder(agent, n)) %>%
  ggplot(aes(x = n, y = agent)) +
  geom_segment(aes(x = 0, xend = n, y = agent, yend = agent), color = "red") +
  geom_point(color = "red", size = 4) +
  theme_light() +
  theme(
    panel.grid.major.x = element_blank(),
    panel.border = element_blank(),
    axis.ticks.x = element_blank()
  ) +
  labs(title = "Bottom 10", x = "", y = "") +
  xlim(0, 2500)

# Show both panels side by side under a common heading.
grid.arrange(
  Top10byCall, Tail10byCall,
  nrow = 1, ncol = 2,
  top = "Agent Ranking By Number Of Call"
)

agent performance by sum duration of the call

# Total call length per agent, largest first.
CallCentreData %>%
  group_by(agent) %>%
  summarise(Sum_length = sum(length, na.rm = TRUE)) %>%
  arrange(desc(Sum_length))

# Lollipop chart of the ten agents with the largest total call length.
Top10byDuration <- CallCentreData %>%
  group_by(agent) %>%
  summarise(n = sum(length, na.rm = TRUE)) %>%
  mutate(agent = paste("Agent", agent)) %>%
  arrange(desc(n)) %>%
  mutate(agent = fct_reorder(agent, n)) %>%
  head(10) %>%
  ggplot(aes(x = n, y = agent)) +
  geom_segment(aes(x = 0, xend = n, y = agent, yend = agent), color = "blue") +
  geom_point(color = "green", size = 4) +
  theme_light() +
  theme(panel.border = element_blank()) +
  labs(title = "Top 10", x = "", y = "") +
  scale_x_continuous(labels = number, limits = c(0, 800000))

# Lollipop chart of the ten agents with the smallest total call length.
Tail10byDuration <- CallCentreData %>%
  group_by(agent) %>%
  summarise(n = sum(length, na.rm = TRUE)) %>%
  mutate(agent = paste("Agent", agent)) %>%
  arrange(desc(n)) %>%
  mutate(agent = fct_reorder(agent, n)) %>%
  tail(10) %>%
  ggplot(aes(x = n, y = agent)) +
  geom_segment(aes(x = 0, xend = n, y = agent, yend = agent), color = "red") +
  geom_point(color = "red", size = 4) +
  theme_light() +
  theme(panel.border = element_blank()) +
  labs(title = "Bottom 10", x = "", y = "") +
  scale_x_continuous(labels = number, limits = c(0, 800000))

# Show both panels side by side under a common heading.
grid.arrange(
  Top10byDuration, Tail10byDuration,
  nrow = 1, ncol = 2,
  top = "Agent Ranking By Sum Duration Of The Call"
)

agent performance by satisfaction

# Mean satisfaction score per agent (unrated calls ignored), best first.
CallCentreData %>%
  group_by(agent) %>%
  summarise(Mean_satisfaction = mean(customer_satisfaction, na.rm = TRUE)) %>%
  arrange(desc(Mean_satisfaction))

# Lollipop chart of the ten agents with the highest mean score.
Top10bySatisfaction <- CallCentreData %>%
  group_by(agent) %>%
  summarise(n = mean(customer_satisfaction, na.rm = TRUE)) %>%
  mutate(agent = paste("Agent", agent)) %>%
  arrange(desc(n)) %>%
  mutate(agent = fct_reorder(agent, n)) %>%
  head(10) %>%
  ggplot(aes(x = n, y = agent)) +
  geom_segment(aes(x = 0, xend = n, y = agent, yend = agent), color = "blue") +
  geom_point(color = "green", size = 4) +
  theme_light() +
  theme(panel.border = element_blank()) +
  labs(title = "Top 10", x = "", y = "") +
  scale_x_continuous(labels = number, limits = c(0, 10))

# Lollipop chart of the ten agents with the lowest mean score.
Tail10bySatisfaction <- CallCentreData %>%
  group_by(agent) %>%
  summarise(n = mean(customer_satisfaction, na.rm = TRUE)) %>%
  mutate(agent = paste("Agent", agent)) %>%
  arrange(desc(n)) %>%
  mutate(agent = fct_reorder(agent, n)) %>%
  tail(10) %>%
  ggplot(aes(x = n, y = agent)) +
  geom_segment(aes(x = 0, xend = n, y = agent, yend = agent), color = "red") +
  geom_point(color = "red", size = 4) +
  theme_light() +
  theme(panel.border = element_blank()) +
  labs(title = "Bottom 10", x = "", y = "") +
  scale_x_continuous(labels = number, limits = c(0, 10))

# Show both panels side by side under a common heading.
grid.arrange(
  Top10bySatisfaction, Tail10bySatisfaction,
  nrow = 1, ncol = 2,
  top = "Agent Ranking By Customer Satisfaction"
)

agent performance by waiting time

# Mean waiting time per agent, longest first.
CallCentreData %>%
  group_by(agent) %>%
  summarise(Mean_waiting = mean(waiting, na.rm = TRUE)) %>%
  arrange(desc(Mean_waiting))

# Ten agents with the LONGEST mean waiting time. This is the red "bottom"
# panel, so its title now says so instead of "Top 10".
Tail10byWaiting <- CallCentreData %>%
  group_by(agent) %>%
  summarise(n = mean(waiting, na.rm = TRUE)) %>%
  mutate(agent = paste("Agent", agent)) %>%
  arrange(desc(n)) %>%
  mutate(agent = fct_reorder(agent, desc(n))) %>%
  head(10) %>%
  ggplot(aes(x = n, y = agent)) +
  geom_segment(aes(x = 0, xend = n, y = agent, yend = agent), color = "red") +
  geom_point(color = "red", size = 4) +
  theme_light() +
  theme(panel.border = element_blank()) +
  xlab("") +
  ggtitle("Bottom 10 (longest waiting)") +
  scale_x_continuous(labels = number, limits = c(0, 60)) +
  ylab("")

# Ten agents with the SHORTEST mean waiting time. This is the blue/green "top"
# panel, shown on the left like the other rankings.
Top10byWaiting <- CallCentreData %>%
  group_by(agent) %>%
  summarise(n = mean(waiting, na.rm = TRUE)) %>%
  mutate(agent = paste("Agent", agent)) %>%
  arrange(desc(n)) %>%
  mutate(agent = fct_reorder(agent, desc(n))) %>%
  tail(10) %>%
  ggplot(aes(x = n, y = agent)) +
  geom_segment(aes(x = 0, xend = n, y = agent, yend = agent), color = "blue") +
  geom_point(color = "green", size = 4) +
  theme_light() +
  theme(panel.border = element_blank()) +
  xlab("") +
  ggtitle("Top 10 (shortest waiting)") +
  scale_x_continuous(labels = number, limits = c(0, 60)) +
  ylab("")

# Show both panels side by side under a common heading.
grid.arrange(
  Top10byWaiting, Tail10byWaiting,
  nrow = 1, ncol = 2,
  top = "Agent Ranking By Waiting Time"
)

agent by problem

# Calls per agent and problem type, one column per problem. An agent who never
# handled a given problem gets 0 rather than NA.
CallCentreData %>%
  count(agent, problem) %>%
  pivot_wider(
    names_from = problem,
    values_from = n,
    values_fill = list(n = 0L),
    names_sort = TRUE
  )

allocation of agent by gender

# Number and share of agents per gender (missing values included).
HRMData %>%
  count(gender) %>%
  mutate(percentage = n / sum(n) * 100)

# Pie chart of the shares among agents with a recorded gender.
HRMData %>%
  filter(!is.na(gender)) %>%
  count(gender) %>%
  mutate(percentage = round(n / sum(n) * 100, 1)) %>%
  ggplot(aes(x = "", y = percentage, fill = gender)) +
  geom_bar(stat = "identity", width = 1, color = "white") +
  coord_polar("y", start = 0) +
  theme_void(base_size = 13) +
  theme(legend.position = "none") +
  ggtitle("Percentage Gender") +
  # position_stack() centres each label inside its own slice, so the labels no
  # longer depend on a hand-computed cumulative position matching the order in
  # which the slices are stacked.
  geom_text(
    aes(
      group = gender,
      label = paste0(gender, "\n", n, " (", percentage, "%)")
    ),
    position = position_stack(vjust = 0.5),
    color = "white", size = 4
  )

agent by tenure

# Five-number summary and mean of agent tenure.
summary(HRMData$tenure)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    0.00   26.25   54.50   65.47   91.50  229.00
# color=group, fill=group
# Density-scaled histogram of agent tenure with a kernel density overlay and a
# red vertical line at the mean.
ggplot(HRMData, aes(x = tenure)) +
  geom_histogram(
    aes(y = after_stat(density)),
    fill = "#999999", bins = 20, position = "identity", alpha = 0.7
  ) +
  geom_density(alpha = 0.6, color = "darkblue", fill = "lightblue") +
  labs(
    title = "Distribution Of Agent Tenure",
    x = "Number of months",
    y = "Density"
  ) +
  theme_classic(base_size = 14) +
  geom_vline(xintercept = mean(HRMData$tenure, na.rm = TRUE), color = "#fc0303") +
  # annotate() draws the label exactly once; geom_text() with constant
  # aesthetics would redraw it for every row of HRMData.
  annotate(
    "text",
    x = mean(HRMData$tenure, na.rm = TRUE),
    y = 0,
    label = paste("Mean:", round(mean(HRMData$tenure, na.rm = TRUE))),
    hjust = -0.1, vjust = 0, size = 3.5
  )

agent by qualification

# Number of agents per qualification, most common first.
HRMData %>%
  count(qualification) %>%
  arrange(desc(n))

# Pie chart of the shares, excluding rows whose qualification is the literal
# string "NA" (filter() also drops real missing values here).
HRMData %>%
  filter(qualification != "NA") %>%
  count(qualification) %>%
  mutate(percentage = round(n / sum(n) * 100, 1)) %>%
  ggplot(aes(x = "", y = percentage, fill = qualification)) +
  geom_bar(stat = "identity", width = 1, color = "white") +
  coord_polar("y", start = 0) +
  theme_void(base_size = 10) +
  theme(legend.position = "none") +
  ggtitle("Percentage Of Education Degree") +
  # position_stack() centres each label inside its own slice, so the labels no
  # longer depend on a hand-computed cumulative position matching the order in
  # which the slices are stacked.
  geom_text(
    aes(
      group = qualification,
      label = paste0(qualification, "\n", n, " (", percentage, "%)")
    ),
    position = position_stack(vjust = 0.5),
    color = "white", size = 4
  )

agent by ethnicity

# Number of agents per ethnicity, most common first.
HRMData %>%
  count(ethnicity) %>%
  arrange(desc(n))

# Pie chart of the shares among agents with a recorded ethnicity.
# NOTE(review): other columns store missing values as the string "NA";
# confirm whether ethnicity needs the same `!= "NA"` filter.
HRMData %>%
  filter(!is.na(ethnicity)) %>%
  count(ethnicity) %>%
  mutate(percentage = round(n / sum(n) * 100, 1)) %>%
  ggplot(aes(x = "", y = percentage, fill = ethnicity)) +
  geom_bar(stat = "identity", width = 1, color = "white") +
  coord_polar("y", start = 0) +
  theme_void(base_size = 10) +
  theme(legend.position = "none") +
  ggtitle("Percentage Of Employee’s Ethnic") +
  # position_stack() centres each label inside its own slice, so the labels no
  # longer depend on a hand-computed cumulative position matching the order in
  # which the slices are stacked.
  geom_text(
    aes(
      group = ethnicity,
      label = paste0(ethnicity, "\n", n, " (", percentage, "%)")
    ),
    position = position_stack(vjust = 0.5),
    color = "white", size = 4
  )

Combine Data

# Make the join key numeric in both tables, then attach each agent's HR record
# to every call. Calls whose agent has no HR record are kept.
HRMData <- HRMData %>%
  mutate(agent = as.numeric(agent))
CallCentreData <- CallCentreData %>%
  mutate(agent = as.numeric(agent))
CombineData <- CallCentreData %>%
  left_join(HRMData, by = "agent")

satisfaction by gender

# Distribution of the satisfaction score per agent gender, ignoring unrated
# calls; `Missing` counts the unrated calls in each group.
CombineData %>%
  group_by(gender) %>%
  summarise(
    Count = n(),
    Min = min(customer_satisfaction, na.rm = TRUE),
    "1st Quartile" = quantile(customer_satisfaction, probs = 0.25, na.rm = TRUE),
    Median = median(customer_satisfaction, na.rm = TRUE),
    Mean = mean(customer_satisfaction, na.rm = TRUE),
    "3rd Quartile" = quantile(customer_satisfaction, probs = 0.75, na.rm = TRUE),
    Max = max(customer_satisfaction, na.rm = TRUE),
    Missing = sum(is.na(customer_satisfaction))
  )

# Box plot per gender, excluding both real NA and the literal string "NA".
CombineData %>%
  filter(!is.na(gender), gender != "NA") %>%
  ggplot(aes(x = customer_satisfaction, y = gender, fill = gender)) +
  geom_boxplot() +
  ggtitle("Satisfaction By Sex") +
  ylab("Sex") +
  xlab("Satisfaction") +
  theme(
    axis.text.y = element_blank(),
    axis.ticks = element_blank()
  )
## Warning: Removed 41385 rows containing non-finite values (stat_boxplot).

# Two-sample t-test of the satisfaction score between genders.
t.test(customer_satisfaction ~ gender, data = CombineData)
## 
##  Welch Two Sample t-test
## 
## data:  customer_satisfaction by gender
## t = 1.4612, df = 57281, p-value = 0.144
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -0.005965469  0.040919404
## sample estimates:
## mean in group female   mean in group male 
##             8.315402             8.297925

Duration by gender

# Distribution of call length per agent gender.
CombineData %>%
  group_by(gender) %>%
  summarise(
    Count = n(),
    Min = min(length, na.rm = TRUE),
    "1st Quartile" = quantile(length, probs = 0.25, na.rm = TRUE),
    Median = median(length, na.rm = TRUE),
    Mean = mean(length, na.rm = TRUE),
    "3rd Quartile" = quantile(length, probs = 0.75, na.rm = TRUE),
    Max = max(length, na.rm = TRUE),
    Missing = sum(is.na(length))
  )

# Box plot per gender, excluding both real NA and the literal string "NA".
CombineData %>%
  filter(!is.na(gender), gender != "NA") %>%
  ggplot(aes(x = length, y = gender, fill = gender)) +
  geom_boxplot() +
  ggtitle("Duration By Sex") +
  ylab("Sex") +
  xlab("Duration") +
  theme(
    axis.text.y = element_blank(),
    axis.ticks = element_blank()
  )

# Two-sample t-test of call length between genders.
t.test(length ~ gender, data = CombineData)
## 
##  Welch Two Sample t-test
## 
## data:  length by gender
## t = 1.0982, df = 79315, p-value = 0.2721
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -0.9074283  3.2201456
## sample estimates:
## mean in group female   mean in group male 
##             312.0080             310.8516

Waiting time by gender

# Distribution of waiting time per agent gender.
CombineData %>%
  group_by(gender) %>%
  summarise(
    Count = n(),
    Min = min(waiting, na.rm = TRUE),
    "1st Quartile" = quantile(waiting, probs = 0.25, na.rm = TRUE),
    Median = median(waiting, na.rm = TRUE),
    Mean = mean(waiting, na.rm = TRUE),
    "3rd Quartile" = quantile(waiting, probs = 0.75, na.rm = TRUE),
    Max = max(waiting, na.rm = TRUE),
    Missing = sum(is.na(waiting))
  )

# Box plot per gender, excluding both real NA and the literal string "NA".
# The x aesthetic is `waiting`, matching the title and axis label; it was
# previously mapped to customer_satisfaction.
CombineData %>%
  filter(!is.na(gender), gender != "NA") %>%
  ggplot(aes(x = waiting, y = gender, fill = gender)) +
  geom_boxplot() +
  ggtitle("Waiting Time By Sex") +
  ylab("Sex") +
  xlab("Waiting time") +
  theme(
    axis.text.y = element_blank(),
    axis.ticks = element_blank()
  )
## Warning: Removed 41385 rows containing non-finite values (stat_boxplot).

# Two-sample t-test of waiting time between genders.
t.test(waiting ~ gender, data = CombineData)
## 
##  Welch Two Sample t-test
## 
## data:  waiting by gender
## t = 1.1163, df = 78806, p-value = 0.2643
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -0.3102183  1.1310442
## sample estimates:
## mean in group female   mean in group male 
##             53.30955             52.89913

satisfaction by qualification

# Distribution of the satisfaction score per agent qualification, ignoring
# unrated calls; `Missing` counts the unrated calls in each group.
CombineData %>%
  group_by(qualification) %>%
  summarise(
    Count = n(),
    Min = min(customer_satisfaction, na.rm = TRUE),
    "1st Quartile" = quantile(customer_satisfaction, probs = 0.25, na.rm = TRUE),
    Median = median(customer_satisfaction, na.rm = TRUE),
    Mean = mean(customer_satisfaction, na.rm = TRUE),
    "3rd Quartile" = quantile(customer_satisfaction, probs = 0.75, na.rm = TRUE),
    Max = max(customer_satisfaction, na.rm = TRUE),
    Missing = sum(is.na(customer_satisfaction))
  )

# Box plot per qualification, excluding real NA and the literal string "NA".
CombineData %>%
  filter(!is.na(qualification), qualification != "NA") %>%
  ggplot(aes(x = customer_satisfaction, y = qualification, fill = qualification)) +
  geom_boxplot() +
  ggtitle("Satisfaction By Qualification") +
  ylab("Qualification") +
  xlab("Satisfaction") +
  theme(
    axis.text.y = element_blank(),
    axis.ticks = element_blank()
  )
## Warning: Removed 40355 rows containing non-finite values (stat_boxplot).

# One-way ANOVA of the satisfaction score across qualifications, excluding
# rows whose qualification is the literal string "NA".
summary(
  aov(
    customer_satisfaction ~ qualification,
    data = filter(CombineData, qualification != "NA")
  )
)
##                   Df Sum Sq Mean Sq F value Pr(>F)
## qualification      2      2   1.156   0.369  0.692
## Residuals     103020 323246   3.138               
## 40355 observations deleted due to missingness

Duration by qualification

# Distribution of call length per agent qualification.
CombineData %>%
  group_by(qualification) %>%
  summarise(
    Count = n(),
    Min = min(length, na.rm = TRUE),
    "1st Quartile" = quantile(length, probs = 0.25, na.rm = TRUE),
    Median = median(length, na.rm = TRUE),
    Mean = mean(length, na.rm = TRUE),
    "3rd Quartile" = quantile(length, probs = 0.75, na.rm = TRUE),
    Max = max(length, na.rm = TRUE),
    Missing = sum(is.na(length))
  )

# Box plot per qualification, excluding real NA and the literal string "NA".
CombineData %>%
  filter(!is.na(qualification), qualification != "NA") %>%
  ggplot(aes(x = length, y = qualification, fill = qualification)) +
  geom_boxplot() +
  ggtitle("Duration By Qualification") +
  ylab("Qualification") +
  xlab("Duration") +
  theme(
    axis.text.y = element_blank(),
    axis.ticks = element_blank()
  )

# One-way ANOVA of call length across qualifications, excluding rows whose
# qualification is the literal string "NA".
summary(
  aov(
    length ~ qualification,
    data = filter(CombineData, qualification != "NA")
  )
)
##                   Df     Sum Sq Mean Sq F value Pr(>F)
## qualification      2      55555   27777   0.825  0.438
## Residuals     143375 4824590339   33650

Waiting time by qualification

# Distribution of waiting time per agent qualification.
CombineData %>%
  group_by(qualification) %>%
  summarise(
    Count = n(),
    Min = min(waiting, na.rm = TRUE),
    "1st Quartile" = quantile(waiting, probs = 0.25, na.rm = TRUE),
    Median = median(waiting, na.rm = TRUE),
    Mean = mean(waiting, na.rm = TRUE),
    "3rd Quartile" = quantile(waiting, probs = 0.75, na.rm = TRUE),
    Max = max(waiting, na.rm = TRUE),
    Missing = sum(is.na(waiting))
  )

# Box plot per qualification, excluding real NA and the literal string "NA".
# The plot is drawn once; the original repeated the same statement twice.
CombineData %>%
  filter(!is.na(qualification), qualification != "NA") %>%
  ggplot(aes(x = waiting, y = qualification, fill = qualification)) +
  geom_boxplot() +
  ggtitle("Waiting Time By Qualification") +
  ylab("Qualification") +
  xlab("Waiting time") +
  theme(
    axis.text.y = element_blank(),
    axis.ticks = element_blank()
  )

# One-way ANOVA of waiting time across qualification levels.
# The filter removes the literal "NA" category; because a comparison with a
# real NA yields NA, filter() drops those rows as well.
summary(
  aov(
    waiting ~ qualification,
    data = filter(CombineData, qualification != "NA")
  )
)
##                   Df    Sum Sq Mean Sq F value Pr(>F)  
## qualification      2     23169   11585    2.84 0.0584 .
## Residuals     143375 584826332    4079                 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Satisfaction by ethnicity

# Descriptive statistics of customer satisfaction per ethnicity group: group
# size, minimum, quartiles, median, mean, maximum, and the number of missing
# satisfaction scores. Missing values are ignored when computing the
# statistics.
CombineData %>%
  group_by(ethnicity) %>%
  summarise(
    Count = n(),
    Min = min(customer_satisfaction, na.rm = TRUE),
    "1st Quantile" = quantile(customer_satisfaction, probs = 0.25, na.rm = TRUE),
    Median = median(customer_satisfaction, na.rm = TRUE),
    Mean = mean(customer_satisfaction, na.rm = TRUE),
    "3rd Quantile" = quantile(customer_satisfaction, probs = 0.75, na.rm = TRUE),
    Max = max(customer_satisfaction, na.rm = TRUE),
    Missing = sum(is.na(customer_satisfaction))
  )
# Box plot of customer satisfaction for each ethnicity group.
# The x aesthetic is `customer_satisfaction` so that the plotted variable
# matches the title and axis label. Rows with missing ethnicity are excluded;
# missing satisfaction scores are dropped by ggplot2 with a warning.
CombineData %>%
  filter(!is.na(ethnicity)) %>%
  ggplot(aes(x = customer_satisfaction, y = ethnicity, fill = ethnicity)) +
  geom_boxplot() +
  labs(
    title = "Satisfaction By Ethnicity",
    x = "Satisfaction",
    y = "Ethnicity"
  ) +
  # Hide the y-axis text and all axis ticks; the fill colour still
  # distinguishes the groups.
  theme(
    axis.text.y = element_blank(),
    axis.ticks = element_blank()
  )

# One-way ANOVA of customer satisfaction across ethnicity groups.
# aov() omits rows with a missing value in either variable.
summary(aov(customer_satisfaction ~ ethnicity, data = CombineData))
##                 Df Sum Sq Mean Sq F value Pr(>F)
## ethnicity        3      3  0.9246   0.295  0.829
## Residuals   103484 324615  3.1369               
## 47449 observations deleted due to missingness

Duration by ethnicity

# Descriptive statistics of call duration (`length`) per ethnicity group:
# group size, minimum, quartiles, median, mean, maximum, and the number of
# missing duration values. Missing values are ignored when computing the
# statistics.
CombineData %>%
  group_by(ethnicity) %>%
  summarise(
    Count = n(),
    Min = min(length, na.rm = TRUE),
    "1st Quantile" = quantile(length, probs = 0.25, na.rm = TRUE),
    Median = median(length, na.rm = TRUE),
    Mean = mean(length, na.rm = TRUE),
    "3rd Quantile" = quantile(length, probs = 0.75, na.rm = TRUE),
    Max = max(length, na.rm = TRUE),
    Missing = sum(is.na(length))
  )
# Box plot of call duration (`length`) for each ethnicity group.
# Rows with missing ethnicity are excluded.
CombineData %>%
  filter(!is.na(ethnicity)) %>%
  ggplot(aes(x = length, y = ethnicity, fill = ethnicity)) +
  geom_boxplot() +
  labs(
    title = "Duration By Ethnicity",
    x = "Duration",
    y = "Ethnicity"
  ) +
  # Hide the y-axis text and all axis ticks; the fill colour still
  # distinguishes the groups.
  theme(
    axis.text.y = element_blank(),
    axis.ticks = element_blank()
  )

# One-way ANOVA of call duration (`length`) across ethnicity groups.
# aov() omits rows with a missing value in either variable.
summary(aov(length ~ ethnicity, data = CombineData))
##                 Df     Sum Sq Mean Sq F value Pr(>F)
## ethnicity        3       6373    2124   0.063  0.979
## Residuals   143855 4830335219   33578               
## 7078 observations deleted due to missingness

Waiting time by ethnicity

# Descriptive statistics of waiting time per ethnicity group: group size,
# minimum, quartiles, median, mean, maximum, and the number of missing
# waiting values. Missing values are ignored when computing the statistics.
CombineData %>%
  group_by(ethnicity) %>%
  summarise(
    Count = n(),
    Min = min(waiting, na.rm = TRUE),
    "1st Quantile" = quantile(waiting, probs = 0.25, na.rm = TRUE),
    Median = median(waiting, na.rm = TRUE),
    Mean = mean(waiting, na.rm = TRUE),
    "3rd Quantile" = quantile(waiting, probs = 0.75, na.rm = TRUE),
    Max = max(waiting, na.rm = TRUE),
    Missing = sum(is.na(waiting))
  )
# Box plot of waiting time for each ethnicity group.
# Rows with missing ethnicity are excluded.
CombineData %>%
  filter(!is.na(ethnicity)) %>%
  ggplot(aes(x = waiting, y = ethnicity, fill = ethnicity)) +
  geom_boxplot() +
  labs(
    title = "Waiting Time By Ethnicity",
    x = "Waiting time",
    y = "Ethnicity"
  ) +
  # Hide the y-axis text and all axis ticks; the fill colour still
  # distinguishes the groups.
  theme(
    axis.text.y = element_blank(),
    axis.ticks = element_blank()
  )

# One-way ANOVA of waiting time across ethnicity groups.
# aov() omits rows with a missing value in either variable.
summary(aov(waiting ~ ethnicity, data = CombineData))
##                 Df    Sum Sq Mean Sq F value Pr(>F)
## ethnicity        3     21006    7002   1.719  0.161
## Residuals   143855 585932514    4073               
## 7078 observations deleted due to missingness
# Scatter-plot matrix of five columns of CombineData (length, waiting,
# customer_satisfaction, age, tenure), drawn with psych::pairs.panels()
# using Pearson correlations.
library(psych)

# Request a 12 x 12 plot area via the repr.* plot options.
options(repr.plot.width = 12, repr.plot.height = 12)

CombineData %>%
  dplyr::select(length, waiting, customer_satisfaction, age, tenure) %>%
  pairs.panels(
    method = "pearson",   # correlation method
    hist.col = "#00AFBB",
    density = TRUE,       # show density plots
    ellipses = TRUE       # show correlation ellipses
  )